{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyalink.alink import *\n",
    "useLocalEnv(1)\n",
    "\n",
    "from utils import *\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "pd.set_option('display.max_colwidth', 1000)\n",
    "\n",
    "# Folder holding the word-vector files. ROOT_DIR is not defined in this cell\n",
    "# (presumably supplied by `from utils import *` -- TODO confirm).\n",
    "DATA_DIR = ROOT_DIR + \"wordvec\" + os.sep\n",
    "\n",
    "# Embedding files read by getWikiDependency() / getGlove6B100d() below.\n",
    "WIKI_DEPENDENCY = \"deps.words\"\n",
    "GLOVE_6B_100D = \"glove.6B.100d.txt\"\n",
    "\n",
    "# File names that are declared here but not used by the functions in this cell.\n",
    "ORIGIN_FILE = \"三国演义.txt\"\n",
    "W2V_MODEL_FILE = \"w2v_model.ak\"\n",
    "\n",
    "def _loadWordVectors(fileName):\n",
    "    \"\"\"Read a word-vector text file from DATA_DIR as a (word, vec) batch source.\n",
    "\n",
    "    Each line of the file is loaded into a single text column and split at its\n",
    "    first space: the text before the space becomes `word`, everything after it\n",
    "    becomes `vec`.\n",
    "\n",
    "    Args:\n",
    "        fileName: name of the file, relative to DATA_DIR.\n",
    "\n",
    "    Returns:\n",
    "        The batch operator produced by the `select`, with columns `word` and `vec`.\n",
    "    \"\"\"\n",
    "    return (\n",
    "        TextSourceBatchOp()\n",
    "        .setFilePath(DATA_DIR + fileName)\n",
    "        .setTextCol(\"txt\")\n",
    "        .select(\n",
    "            # POSITION(' ' IN txt) locates the first space, i.e. the end of the word.\n",
    "            \"SUBSTRING(txt FROM 1 FOR POSITION(' ' IN txt)-1 ) AS word, \"\n",
    "            \"SUBSTRING(txt FROM POSITION(' ' IN txt) + 1 ) AS vec\"\n",
    "        )\n",
    "    )\n",
    "\n",
    "\n",
    "def getWikiDependency():\n",
    "    \"\"\"Return the (word, vec) source for the WIKI_DEPENDENCY file.\"\"\"\n",
    "    return _loadWordVectors(WIKI_DEPENDENCY)\n",
    "\n",
    "\n",
    "def getGlove6B100d():\n",
    "    \"\"\"Return the (word, vec) source for the GLOVE_6B_100D file.\"\"\"\n",
    "    return _loadWordVectors(GLOVE_6B_100D)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_1_2\n",
    "\n",
    "# For each embedding and each distance metric, list the 7 nearest words to 'king'.\n",
    "for word2vec in (getWikiDependency(), getGlove6B100d()):\n",
    "    for metric in (\"EUCLIDEAN\", \"COSINE\"):\n",
    "        neighbor_model = (\n",
    "            VectorNearestNeighbor()\n",
    "            .setIdCol(\"word\")\n",
    "            .setSelectedCol(\"vec\")\n",
    "            .setMetric(metric)\n",
    "            .setOutputCol(\"similar_words\")\n",
    "            .setTopN(7)\n",
    "            .fit(word2vec)\n",
    "        )\n",
    "        king_neighbors = (\n",
    "            neighbor_model\n",
    "            .transform(word2vec.filter(\"word='king'\"))\n",
    "            .select(\"word, similar_words\")\n",
    "        )\n",
    "        # The metric name is passed as the second lazyPrint argument.\n",
    "        king_neighbors.lazyPrint(-1, metric)\n",
    "        # Run the job once per (embedding, metric) combination.\n",
    "        BatchOperator.execute()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_1_3\n",
    "\n",
    "# Queue a print of the four analogy words from each embedding, then run once.\n",
    "for load_vectors in (getWikiDependency, getGlove6B100d):\n",
    "    load_vectors().filter(\"word IN ('man', 'woman', 'king', 'queen')\").lazyPrint(-1)\n",
    "\n",
    "BatchOperator.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_1_3_1\n",
    "vec_man = DenseVector(\n",
    "    [\n",
    "        -0.00220404170083,0.0678135463787,0.0415827872463,-0.0145794269917,-0.0180352093149,0.144706288126,\n",
    "        -0.042149784084,-0.0216009491719,0.0638050780007,-0.0129117679223,0.151791574806,-0.0337671071952,\n",
    "        -0.0565042238321,0.075027992403,0.0542110567498,-0.0134639105297,-0.0519148101319,-0.103488516415,\n",
    "        -0.0298512060324,0.0140816291291,0.0432066082216,-0.0303133175126,-0.0883812208024,0.062041409882,\n",
    "        -0.044512874738,0.0710490513363,-0.0567426161127,-0.0569431482211,0.0664918821847,-0.00548356590169\n",
    "        ,0.00485244226516,0.0314308266153,0.0779908678907,-0.0283226695098,0.0350599686745,0.0332841635629,\n",
    "        -0.0607813363988,0.024102437776,0.0405754168265,-0.0247784864016,0.0431761751645,0.0862874989335,\n",
    "        0.0128255409272,0.0690049643344,0.0415603247518,-0.0194483538295,-0.0272676568636,-0.0585464993425,\n",
    "        -0.00153650708527,0.0179571340315,-0.0639164847275,0.00511039865392,-0.0106836333846,\n",
    "        -0.0404940808584,-0.0407839194974,0.0181438988046,0.0341792215101,-0.0105126285875,0.105609229508,\n",
    "        0.0762344457381,-0.0607018119222,-0.00303642204236,-0.0812955718233,-0.0164353000274,\n",
    "        -0.047065085104,-0.0218728540451,0.11017690531,-0.0706059605168,0.0443634266897,0.00945761594138,\n",
    "        -0.0247411696768,-0.147140570395,0.0117324869596,0.0322894735835,0.0834005249394,-0.0255103289652,\n",
    "        -0.047516146486,-0.0727475057611,0.136698408827,-0.0337493545785,0.0436060420961,0.113064603901,\n",
    "        -0.0783326963356,0.126881574122,-0.0566460636161,-0.00485932593284,-0.124211797958,0.00520622655395\n",
    "        ,-0.0841312806081,0.004456993672,0.0160320620208,0.0449710009368,-0.0171330865509,-0.0630871839216,\n",
    "        0.0419059573289,-0.0361901582176,-0.0500458943575,0.0579496128952,-0.042589976517,0.049046313351,\n",
    "        -0.0349088902839,-0.0228651890856,0.00574351493109,-0.148883225213,0.00681972202772,\n",
    "        -0.0342051258384,0.104763806427,-0.0229291347353,-0.0633724938319,-0.0688037076294,0.0442623454644,\n",
    "        -0.00582756813642,0.0883218038814,0.0774441597578,0.0557479261333,0.0337419274634,0.0232256758929,\n",
    "        0.0116797725572,0.0445858778452,0.138361901466,0.0417927391105,0.0814153114111,0.0447528973609,\n",
    "        0.00998729605424,0.0480103213656,0.0885399074573,0.0713434187041,0.00199264064347,0.0680106366533,\n",
    "        -0.0147074994403,-0.0373772285937,-0.043165849663,0.00223882233752,0.0466046039662,0.0441302877344,\n",
    "        -0.0443967581332,0.0776084620363,0.00467093081745,-0.0657792414309,-0.0361483127641,0.0349045427043\n",
    "        ,0.0420131976253,-0.00180841195853,-0.0023663513387,0.0141508281042,0.0455017679446,0.0515291435928\n",
    "        ,0.0187927750578,-0.0236693101598,-0.0991280752401,-0.026072615925,-0.0035715366297,\n",
    "        -0.0356289581528,-0.00706137499303,-0.028393317678,-0.0226537880282,-0.0562448182502,\n",
    "        -0.00793795572701,0.0294197087586,-0.0522501172074,-0.0132137435543,-0.024405500303,-0.106365889505\n",
    "        ,-0.0284715741106,-0.112262113169,-0.027080167494,0.0371308657505,-0.0164880144299,0.0120855466518,\n",
    "        -0.0615039403557,-0.0636583471896,0.0413688501009,-0.0335770817374,0.0115250711836,0.00234189620353\n",
    "        ,0.00517325740876,-0.0318260940593,-0.0752989915305,0.0732786350672,0.0173156848935,0.0220239324357\n",
    "        ,0.0362946812768,-0.0878689310079,0.133256756137,-0.0165050424499,0.00116569477643,-0.0684093459311\n",
    "        ,-0.0816989909791,0.0488830979674,0.045251238671,-0.0607186587931,-0.0260135613023,0.0855552940716,\n",
    "        -0.12988285323,-0.0763797273559,-0.0939136969744,-0.0742133646781,0.0111490055494,-0.00542288093664\n",
    "        ,-0.0472696024937,-0.00213194433937,-0.00409415192574,0.025322114999,-0.0282545574296,\n",
    "        0.0245714329238,-0.00910890382877,0.0557037257409,0.0365459351471,0.0267329045746,0.0915252454395,\n",
    "        0.120522333232,-0.0476376975653,0.0422655383904,0.0116381082528,-0.020368048061,0.0774028577517,\n",
    "        0.172846359293,0.0562408329689,0.0931919987632,0.0471446095806,-0.0731395125204,0.0468344822368,\n",
    "        0.0484217110839,-0.073229905946,0.050022344968,0.0151862766422,-0.0681736708878,0.0626352167938,\n",
    "        0.0696656152823,-0.0243629302529,0.00182996870731,0.00116949890857,0.159671381657,0.0205299954006,\n",
    "        0.0500475246998,0.0362838123279,-0.0326323889233,0.079122506627,-0.0585609912745,0.0344496771901,\n",
    "        0.0229575751518,0.00719017203825,0.0602706769466,0.01418578989,-0.0499381106136,-0.123776315403,\n",
    "        -0.0922469436507,0.0141167720641,-0.00585727659692,-0.0237772750528,0.0728150743939,\n",
    "        -0.0151895373269,-0.00172906863109,0.0540235673801,0.0352646672133,-0.0869593811287,\n",
    "        -0.0638471046033,-0.0194309635111,0.0705387541824,-0.0673945483962,0.0364571720639,-0.0613354716468\n",
    "        ,0.0492368822562,-0.0462057135392,-0.0107028351944,0.0119197951801,0.0393289295294,0.048293276337,\n",
    "        -0.0136312923437,-0.179455586009,0.052450830465,-0.00182326618878,0.0396665915439,0.00621885029914,\n",
    "        -0.0347118000093,0.0515389256469,0.0218607170521,-0.0478699307748,-0.0394441403884,-0.0236352541197\n",
    "        ,-0.00943768953494,-0.028646745338,-0.0799038029084,0.0192313371485,0.054759395225,0.0405284991968,\n",
    "        0.0752306983012,0.0205116993365,0.0747990198782,-0.0646151769968,0.00833883879464,0.050741325942,\n",
    "        0.0133485185215,-0.0694659889196,0.0447963731568,-0.000302881377808,-0.0743483207944,\n",
    "        -0.0796594327059,-0.0164630158472,-0.0441532936764\n",
    "    ]\n",
    ")\n",
    "\n",
    "vec_woman = DenseVector(\n",
    "    [                   \n",
    "        -0.0923953735088,0.062354665309,-0.0682686456655,0.048195826923,-0.0095468221514,0.105997991417,\n",
    "        -0.014845581153,0.0151306377161,0.072575366664,0.00120951798016,0.149595694754,-0.0258451283883,\n",
    "        0.0146608603837,0.0479136861009,-0.00676829247491,-0.0733037873761,-0.113669992093,-0.0400013940617\n",
    "        ,0.0148815991303,0.0490117198657,0.0486700636239,0.0238102841857,-0.0545149237669,0.068213075072,\n",
    "        -0.0436013051345,0.133555174326,-0.0357794010668,-0.0117240231214,0.0670014989217,-0.0148994366047,\n",
    "        -0.010707458591,0.00960856725532,0.0514741489129,-0.0597749205643,0.0573469658668,0.00388170886656,\n",
    "        -0.0786130087425,0.0356509369478,0.0715052897102,0.0058419444019,-0.0451022257023,0.0326794538216,\n",
    "        0.0290673652421,0.0376176900219,-0.00313013374049,-0.0558396992744,-0.0120471558319,\n",
    "        0.00773083003938,0.0160659045263,0.0284042914317,-0.0337920662914,-0.00866609684962,\n",
    "        -0.00168701345049,-0.0280652079027,-0.0222076557106,0.0599160767324,-0.00770630351199,\n",
    "        -0.0667687541827,0.144065220098,0.0177952819787,-0.00562394988222,-0.02769559485,-0.108722151099,\n",
    "        -0.0929776641416,-0.109614367851,-0.0185979683296,0.0672939305944,-0.0590288338919,0.0495305502528,\n",
    "        0.00501318789593,-0.0188578123087,-0.129812220429,-0.0227613034758,0.0181184146892,0.0540226780773,\n",
    "        -0.0324755234645,-0.0495677688293,-0.1263689018,0.14630982614,-0.0534362711042,0.0445444615969,\n",
    "        0.0700842947491,-0.180258827904,0.0878787191569,-0.0861760979163,-0.0857423385612,-0.102763748571,\n",
    "        0.025485463158,-0.0720994148213,0.0178841263226,0.000425698188706,-0.0197947942607,0.024358271983,\n",
    "        -0.109326738575,-0.039054121259,-0.021232426097,-0.0176808820222,0.0184684751256,-0.00301950709596,\n",
    "        0.0458568880836,-0.0280156403053,-0.0652055739684,-0.0428989545774,-0.124810523983,\n",
    "        -0.000185749854299,-0.0701319756905,0.0403401345623,-0.0182767222751,-0.0878262358186,\n",
    "        -0.0598584479688,0.0647704425,-0.0495142564059,0.0499855773658,0.0380457894091,0.0419012566065,\n",
    "        -0.014114416214,0.0751813530779,-0.0190830104238,-0.00171359814801,0.082342927562,0.0802993361363,\n",
    "        0.134567107973,0.0650849995016,0.0141840509701,-0.00596440552468,0.0295458897975,0.0468542430261,\n",
    "        0.0334896867964,0.0427727201427,0.00837469426195,-0.0867474116417,-0.0196050995803,0.0606614773481,\n",
    "        0.0577649459175,-0.00796528992011,-0.00462316465616,0.133496859505,0.0233127215566,-0.0632115501401\n",
    "        ,-0.0167939822101,0.0507690541289,0.0441614704384,-0.0453769914148,0.0339716416353,-0.0274534168313\n",
    "        ,0.0892136140009,0.0720007941692,0.0383738960308,-0.065645679348,-0.0230317813338,0.0215128517773,\n",
    "        -0.0497991414548,-0.0185619503524,-0.00282655364621,0.0283879975848,-0.0389481254973,\n",
    "        -0.0275192782755,0.0505514026376,0.0402641537817,-0.0335474870742,-0.00767920427194,\n",
    "        -0.0159605948213,-0.0863086783755,0.0770839598496,-0.109090906581,0.00196075007788,0.0114959092652,\n",
    "        -0.0207082787704,0.00355531738666,-0.0648733510065,-0.0437623569472,0.0809910528144,\n",
    "        -0.0348729486384,-0.042169333266,0.00687686094931,0.00771470770669,0.00937685160144,\n",
    "        -0.0327782459879,0.112907268546,0.0707099784689,-0.0238847213388,0.0190627717508,-0.0365002751551,\n",
    "        0.0650844849591,-0.0474641474415,-0.0371004032624,-0.0441911423911,-0.0650795110479,0.0596349649954\n",
    "        ,0.0679506583803,-0.104315780363,-0.0358552103333,0.0688571108087,-0.125921249797,-0.0793501766777,\n",
    "        -0.0764694245514,-0.0625042256718,-0.0398351968236,-0.00139784054712,-0.0992506236718,\n",
    "        -0.0349307489162,-0.0105265111336,0.0368045413061,-0.036131176645,0.00701733106073,0.065765396244,\n",
    "        0.0260879924637,0.00788244857235,-0.00236672413616,0.0336365029324,0.0539302319356,-0.0723362759005\n",
    "        ,0.0242152291589,0.0638574725328,-0.0468681356745,0.0453092433146,0.0645030518971,0.0179481011109,\n",
    "        0.0822961041915,0.101609458156,-0.0258916087304,0.0642037596573,0.0292023468999,-0.083472176907,\n",
    "        0.0748539325129,0.0146876165954,-0.030909255995,0.065078653477,0.0733888584082,-0.0368355853722,\n",
    "        0.049940297623,-0.0325051954172,0.104120597229,-0.00500769944225,-0.0448897196363,-0.036414175038,\n",
    "        0.0349345222282,0.15395078275,-0.0840393742922,0.04540717791,-0.0183967821994,-0.0147565652948,\n",
    "        0.0229129220087,-0.0174109187067,0.0180155061827,-0.113339312759,-0.11662055049,0.0344210087805,\n",
    "        -0.0368690306369,-0.0361822878699,-0.0173210452777,-0.0278091372356,-0.0549642909121,\n",
    "        0.0831401254593,0.0164374042349,-0.0477666984507,-0.0735370466576,-0.0335068382141,0.0567238548597,\n",
    "        -0.0508843116563,-0.0149018378032,-0.0675674957076,0.0611645284309,0.0208751620651,\n",
    "        -0.00623368278345,-0.0568559207764,0.0579577278531,0.0681614493045,-0.0232475461691,-0.139026819618\n",
    "        ,0.0638504404515,-0.0417685046331,-0.00878392708961,0.048339727318,-0.0840908285455,0.0444873473758\n",
    "        ,0.0425456353716,-0.0316160659207,-0.0688315551962,0.0310598454429,0.0188993187396,-0.0333886649458\n",
    "        ,-0.0515029632947,0.0329230039537,0.0189056647642,0.0479807481443,0.0869861593769,-0.0252765588897,\n",
    "        0.0200299401981,-0.0247308007767,0.0251559844229,-0.0181552902374,0.0185816744828,-0.0372676295855,\n",
    "        0.0386944560286,-0.0658888864518,-0.039044516465,-0.0329171724717,-0.0518072294457,\n",
    "        0.0179050510523\n",
    "    ]\n",
    ")\n",
    "\n",
    "vec_king = DenseVector(\n",
    "    [\n",
    "        0.0330381329194,0.0665698704227,0.0262800220171,-0.0573233783314,0.0135350782339,0.0539792823665,\n",
    "        -0.0321528819666,-0.0233145564785,0.00133663715962,0.02802799633,0.0941987111865,0.00248237383102,\n",
    "        0.0470419519499,0.103658297617,0.0284782144395,0.0231670424649,-0.00882697825621,-0.0873673438906,\n",
    "        -0.0683468802995,-0.0462785168681,-0.0534324459156,-0.00981769171647,0.0455728191718,\n",
    "        -0.0544375102867,-0.0192650965594,0.142169300564,0.0191646401836,0.0453715726781,0.0132457238226,\n",
    "        0.0238588898636,-0.0716704511245,-0.049105145688,0.119388898336,-0.0108634726252,-0.0840693043372,\n",
    "        -0.0117917763105,0.00234220214387,0.0185465497917,0.0442281722014,0.0254097894593,0.0349079231033,\n",
    "        0.0927918213123,0.0419685712304,0.00143726040646,-0.0181866089236,-0.0293169083676,0.00289120791854\n",
    "        ,0.0412899066451,-0.00506653900628,-0.0222759510252,-0.00194271283549,-0.0767471692652,\n",
    "        -0.0921146585665,0.00433781310415,0.0158220460582,-0.0100643271374,0.0112636294179,\n",
    "        -0.00254645231657,-0.115208110896,-0.0190087826172,0.0234410447391,0.0215248644015,\n",
    "        0.000333575241208,0.0233684558297,-0.0776838165032,-0.0638970967167,-0.0249542314761,\n",
    "        -0.0100251124392,-0.0396964549301,0.0194451504289,-0.0408672222599,-0.0939577493815,\n",
    "        -0.0254124593962,-0.0222377375533,0.040674319319,0.00156474989335,-0.00562972882073,\n",
    "        -0.0923975050066,-0.0037240613597,-0.0700389528087,0.0437459148492,-0.0383488042808,\n",
    "        -0.0227029740578,0.0856896222922,-0.0528413886348,0.0869177932654,-0.108374240534,-0.0430757606878,\n",
    "        0.0234325343152,-0.0852679391334,0.0398711689258,0.0553487931242,0.0518913917122,0.011713013172,\n",
    "        -0.0292461550398,0.0413067606217,-0.0264916145162,-0.0532630717936,0.0333600271866,0.0153077494632,\n",
    "        -0.00971740221173,0.0125703966584,-0.0324509136728,-0.0187801692703,-0.021092334624,\n",
    "        -0.0305355676905,0.00352782099769,-0.0139532571006,-0.0183539805929,-0.0374970944103,\n",
    "        0.0176983442134,-0.0147283731563,0.0271492533485,0.0611023402686,0.028643249914,-0.0351704112746,\n",
    "        0.120878889996,0.0430932821487,-0.0071095413465,0.0341006009337,-0.0237223893397,0.0256866285415,\n",
    "        -0.0343402277703,0.0487924293288,0.0210194119725,0.00124519182086,0.0485306086417,0.0750082059892,\n",
    "        -0.0373182086381,0.0615213534905,0.00246602046752,0.0265133077535,-0.0168888527201,0.0442101501274,\n",
    "        0.0203390786766,-0.0685734911937,0.0953204184258,0.0034452198249,-0.064757317011,0.0399227320822,\n",
    "        0.133338818015,-0.0327686361637,0.0116489346865,-0.0183264468686,0.0438325209274,-0.023651802883,\n",
    "        0.0995132205822,0.0206986858026,0.0342162425756,-0.0435264794104,-0.00740073133945,-0.0057937630689\n",
    "        ,-0.00267561051402,0.00654418220833,0.0277641731903,0.0659632941337,0.057346072795,-0.0114360072188\n",
    "        ,-0.0240195866907,-0.0124515844665,-0.0359702242466,-0.055722918032,-0.0463884848941,\n",
    "        0.0683348655834,-0.155736918654,-0.0843628305249,0.0482928173867,-0.00038997765818,-0.0406199193547\n",
    "        ,-0.0918284747052,-0.0667976494144,0.0372963485297,0.0383634889337,-0.0168709975171,\n",
    "        -0.0798070838215,0.0714233150903,0.015313089337,0.030887665619,0.0320717826333,0.0237357390242,\n",
    "        -0.00285900180471,-0.0253363661946,-0.0554786188059,0.0294742677735,-0.046435542532,\n",
    "        -0.00314868995815,-0.186628756049,-0.0952353141872,0.182545421306,0.0334120909561,0.014159509726,\n",
    "        -0.0694415544277,0.00423101562823,0.0165244063335,-0.0298050062068,0.0334703289547,0.0128066860739,\n",
    "        -0.0135522659527,-0.0458431502827,-0.139522391871,-0.0334446308121,-0.0507084427953,0.0428137731297\n",
    "        ,-0.0329196544695,-0.0102113405379,0.00478185698452,0.0596383804931,-0.0960895271235,0.125720319948\n",
    "        ,0.169201911044,-0.0187761643649,0.042368894894,0.0135659493793,0.0458503257381,-0.0126451548916,\n",
    "        -0.0886390682094,0.00128323842166,0.0174855836168,0.0806846587057,-0.090234355506,0.0810689627479,\n",
    "        0.0667399120289,0.0380170646212,0.00552376570008,-0.016451650553,0.0818218849532,0.0307398178632,\n",
    "        -0.0350374150429,0.000343253762464,0.0473021039265,-0.128344033563,0.0345044288896,0.00158277196741\n",
    "        ,0.0237108752369,0.0361431026608,0.00922279640136,0.0827633714519,0.0621466193378,-0.0156743651735,\n",
    "        -0.00911149340692,-0.0758977955895,-0.00623830756244,-0.0275959671657,0.00318173042727,\n",
    "        0.0828069247976,-0.103742066887,-0.0408753989416,0.0521663952127,-0.00092496626414,-0.0101921503664\n",
    "        ,-0.0297047167021,-0.0108247585402,0.00731662832716,0.0719734889624,0.0539732750085,-0.107010570263\n",
    "        ,-0.0747198528042,0.00864725812876,-0.0168704969039,-0.0366178508155,-0.0121999429138,\n",
    "        0.0788080268084,0.0223829153722,0.0517285255614,0.0268236879179,0.130153750167,-0.0144270040289,\n",
    "        0.0194927086799,-0.119658561963,-0.122853475203,0.0846531861626,-0.155554862331,-0.0860997913482,\n",
    "        -0.0621898389414,-0.17728314255,0.0893194015052,0.0185023289618,-0.135035896656,0.00151218551067,\n",
    "        0.0556621769676,0.133543235059,-0.0642887430854,0.0156474989335,-0.00345423086193,0.0526394746568,\n",
    "        -0.0207610955776,0.0352259793363,0.0648282372099,0.0572200851477,-0.0698799246922,-0.00263205716837\n",
    "        ,0.13508078497,-0.04197641417,-0.00110151584153,-0.0527783113755,0.0265091359771,-0.0877034221976,\n",
    "        0.00353282712937,-0.013237880883,-0.0142381059934\n",
    "    ]\n",
    ")\n",
    "\n",
    "vec_queen = DenseVector(\n",
    "    [\n",
    "        0.0199859507483,0.1526205478,0.000618664683489,0.0165901690411,0.074417065154,0.0655692910471,\n",
    "        -0.053041765773,-0.0687128661553,0.0255264366915,-0.0386942759625,0.0730841311552,-0.0922892844064,\n",
    "        -0.0229324696981,0.0638844371888,-0.0150418480785,0.0189000085504,-0.0429470228238,-0.0769546175135\n",
    "        ,-0.0130125014973,0.00219606209615,-0.0236931981237,0.0251165219562,0.0139353627365,\n",
    "        -0.0302018982472,-0.00408697949146,0.135300465049,0.0668865806516,-0.00476063659126,\n",
    "        -0.0252876621483,-0.0492251656617,-0.0638152225958,-0.0311961864179,0.0566992355301,0.021559081508,\n",
    "        -0.0473516309267,-0.0499675632819,0.11703903223,0.0025697892932,0.00441677596514,0.0769546175135,\n",
    "        -0.00425179871621,0.0394772857982,0.0202733967404,-0.00569297928158,0.0428439910553,\n",
    "        0.00924504690749,-0.0506530721955,0.067472692353,-0.0232486760918,0.00612564949956,0.0445348498327,\n",
    "        -0.10462986707,-0.0260359066633,-0.0325488734398,0.00375765709034,0.069927756088,0.00613465687809,\n",
    "        -0.016161607452,-0.105185638128,0.0101784957671,0.0802712291041,-0.0216829724689,-0.0205973463193,\n",
    "        0.00489606331763,-0.0986225776848,-0.0480635298792,-0.0312830997196,-0.0945687832722,\n",
    "        0.0230062669924,0.0383120154595,-0.0546972271329,-0.0758915888224,-0.00866667839113,\n",
    "        -0.0110841323702,0.00186942610617,0.0170563403863,0.0174003590366,-0.0958972925938,0.0187356633982,\n",
    "        -0.0299860372108,-0.0212983416031,0.00892915656208,0.00700900469258,0.0946809804434,\n",
    "        -0.0435268135578,0.00714854004776,-0.0369166619077,0.0342996233832,0.0416847256356,0.0207027484505,\n",
    "        0.0526122560387,0.0445416448726,0.0493632787992,-0.0214533633283,0.0302292364312,-0.0469761654636,\n",
    "        0.0588218163804,-0.0169416148281,0.0379850634211,0.0411926383242,-0.0581699666181,-0.0157731840057,\n",
    "        -0.0222235732051,0.00352472944195,-0.0241168609631,0.0057759419786,0.0748854488378,0.043092563098,\n",
    "        -0.0111934851061,-0.0778250147234,-0.00970789974477,-0.009734605832,-0.000632570811751,\n",
    "        0.0428188652099,0.0544333267443,-0.0271946980105,0.0852948179388,0.0416643405158,-0.0533714042225,\n",
    "        0.128496575749,0.0231078765432,0.0491968793326,-0.0049483693228,0.0792908470616,-0.0303152015877,\n",
    "        0.0316332813131,-0.0184588050264,0.101064683437,0.0119540555075,0.0208304319919,-0.0388508779296,\n",
    "        0.0393660367721,-0.0616641973923,-0.00849601227154,-0.0467373909204,-0.00932785158033,\n",
    "        0.0759282504332,0.0299090794328,0.00196234432684,0.0687089155507,0.0694321922445,0.0461394274051,\n",
    "        -0.0353237781249,-0.0585525431695,-0.0481272136257,0.0317462686052,0.165568101357,0.0869850446195,\n",
    "        0.0442423470666,0.0781791469215,0.00161247878169,-0.0178734834459,0.0611762187097,0.0285407480166,\n",
    "        -0.0262222171772,0.0767811069586,-0.00349628508868,-0.00613118034602,0.0282738451685,\n",
    "        -0.0766406234583,0.0388758457508,0.0050253271008,0.00336654723296,0.0447516590142,-0.118273201113,\n",
    "        -0.0591146351948,0.0141709767961,-0.0156278017557,-0.0962939332976,-0.0576453263245,\n",
    "        0.00461794075239,0.0248338166896,0.042656416348,0.00735823814099,-0.116470777261,0.126713114799,\n",
    "        0.00540853574894,-0.00261466816169,0.0717761649776,0.0596139916188,-0.054438225494,-0.0806844623473\n",
    "        ,0.072645297994,-0.0136325883984,-0.0536226626763,0.0188807295998,-0.0959965317818,-0.04431535424,\n",
    "        0.149231245084,0.0243888205852,-0.0337752991381,-0.00974803788771,0.0531040273018,-0.0498515735302,\n",
    "        -0.0612596554793,0.0206042993834,-0.0412204505807,0.0134407470381,0.096124057299,-0.14383155869,\n",
    "        -0.00114614941235,-0.0758740481379,0.0221914942955,-0.0283618646394,0.00667573168684,\n",
    "        -0.00922971856157,-0.011458491664,-0.015071398601,0.0886578886397,0.116466036536,0.0829227169008,\n",
    "        0.0263431056786,0.0541117475282,0.0670427085462,0.0810667228504,-0.0576652373718,-0.0213899956302,\n",
    "        -0.0312309517386,0.0512192728497,-0.0627393939457,0.0873235324233,0.106129990656,0.0314580324922,\n",
    "        0.00744894402306,-0.0164963026754,0.0926408882176,0.0149500360271,0.0327484579852,0.0299005461268,\n",
    "        -0.0543593714258,-0.0530444521841,0.057176310544,-0.0461425878888,-0.0161775678946,-0.0101426242772\n",
    "        ,-0.0337007117228,0.0797318925614,0.115810078144,-0.0557607298966,-0.0341913768166,\n",
    "        -0.00794830044703,-0.0359602995413,-0.0353556990102,0.00737388253529,0.0849293079994,\n",
    "        -0.109218099276,-0.066264439436,0.0851826207676,0.0488566532627,-0.010613694372,-0.0493604343639,\n",
    "        0.0270755477751,0.0503324411246,0.0729935832973,0.0779000762112,-0.0777292520674,-0.0369526914218,\n",
    "        0.0396248803868,0.0754470667905,-0.0569478075728,-0.000792965359322,0.0110006956006,\n",
    "        -0.0244976992485,-0.0154890565215,-0.0568800151975,0.0403171843404,0.0681994455789,0.0543968231576,\n",
    "        -0.072004825973,-0.14427829306,0.0705822922615,-0.12334672559,-0.117944668833,0.00840135578485,\n",
    "        -0.165265801092,0.107454075301,0.0254774491943,-0.116934104171,-0.0859812749976,0.107266184545,\n",
    "        -0.00975767736298,-0.11928281763,-0.00672203277299,0.0101364613339,0.0463544983206,-0.086025205721,\n",
    "        0.0207670642938,0.0134198878457,0.136448668776,-0.0568893386245,-0.0168189880607,0.0293896539373,\n",
    "        -0.0191718101482,0.0392143335547,0.00512172185353,0.0049556384353,-0.0369394173903,\n",
    "        -0.00211562778609,0.0267236279156,0.0352932794572\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Difference vectors reused by the norm report below.\n",
    "man_minus_woman = vec_man.minus(vec_woman)\n",
    "king_minus_queen = vec_king.minus(vec_queen)\n",
    "\n",
    "print(\"'man' vector normL2 : \" + str(vec_man.normL2()))\n",
    "print(\"'woman' vector normL2 : \" + str(vec_woman.normL2()))\n",
    "print(\"'king' vector normL2 : \" + str(vec_king.normL2()))\n",
    "print(\"'queen' vector normL2 : \" + str(vec_queen.normL2()))\n",
    "print(\"'man - woman' normL2 : \" + str(man_minus_woman.normL2()))\n",
    "print(\"'king - queen' normL2 : \" + str(king_minus_queen.normL2()))\n",
    "print(\"(man - woman) - (king - queen) normL2 : \"\n",
    "      + str(man_minus_woman.minus(king_minus_queen).normL2()))\n",
    "\n",
    "# Query rows: the raw 'king'/'queen' vectors plus the two analogy vectors.\n",
    "analogy_queries = [\n",
    "    [\"king\", str(vec_king)],\n",
    "    [\"king-man+woman\", str(vec_king.minus(vec_man).plus(vec_woman))],\n",
    "    [\"queen\", str(vec_queen)],\n",
    "    [\"queen-woman+man\", str(vec_queen.minus(vec_woman).plus(vec_man))],\n",
    "]\n",
    "df = pd.DataFrame(analogy_queries)\n",
    "target_set = BatchOperator.fromDataframe(df, schemaStr='word string, vec string')\n",
    "\n",
    "# Top-5 Euclidean neighbours of every query row among the getWikiDependency() vectors.\n",
    "nearest_words = (\n",
    "    VectorNearestNeighbor()\n",
    "    .setIdCol(\"word\")\n",
    "    .setSelectedCol(\"vec\")\n",
    "    .setMetric('EUCLIDEAN')\n",
    "    .setOutputCol(\"similar_words\")\n",
    "    .setTopN(5)\n",
    "    .fit(getWikiDependency())\n",
    "    .transform(target_set)\n",
    "    .select(\"word, similar_words\")\n",
    ")\n",
    "nearest_words.print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_1_3_2\n",
    "vec_man = DenseVector(\n",
    "    [\n",
    "        0.37293,0.38503,0.71086,-0.65911,-0.0010128,0.92715,0.27615,-0.056203,-0.24294,0.24632,-0.18449,0.31398,\n",
    "        0.48983,0.09256,0.32958,0.15056,0.57317,-0.18529,-0.52277,0.46191,0.92038,0.031001,-0.16246,\n",
    "        -0.40567,0.78621,0.57722,-0.53501,-0.68228,0.16987,0.3631,-0.071773,0.47233,0.027806,-0.14951,\n",
    "        0.17543,-0.37573,-0.78517,0.58171,0.86859,0.031445,-0.45897,-0.040917,0.95897,-0.16975,0.13045,\n",
    "        0.27434,-0.069485,0.022402,0.24977,-0.21536,-0.32406,-0.39867,0.68613,1.7923,-0.37848,-2.2477,\n",
    "        -0.77025,0.46582,1.2411,0.57756,0.41151,0.84328,-0.54259,-0.16715,0.73927,-0.093477,0.90278,0.50889\n",
    "        ,-0.50031,0.26451,0.15443,-0.29432,0.10906,-0.26667,0.35438,0.049079,0.18018,-0.5859,-0.55542,\n",
    "        -0.28987,0.74278,0.3453,-0.028757,-0.22646,-1.3113,-0.5719,-0.52306,-0.1267,-0.098678,-0.53463,\n",
    "        0.28607,-0.37501,0.45742,0.045975,-0.24675,0.045656,-0.38302,-0.93711,0.039138,-0.53911\n",
    "    ]\n",
    ")\n",
    "\n",
    "vec_woman = DenseVector(\n",
    "    [\n",
    "        0.59368,0.44825,0.5932,0.074134,0.11141,1.2793,0.16656,0.2407,0.39045,0.32766,-0.75034,0.35007,0.76057,\n",
    "        0.38067,0.17517,0.031791,0.46849,-0.21653,-0.46282,0.39967,0.16623,-0.011477,0.044059,0.30325,\n",
    "        0.6153,0.47047,-0.44036,-1.5963,0.18433,0.23193,0.20452,0.51617,0.65734,-0.3452,0.23446,-0.62004,\n",
    "        -0.68741,0.28575,1.0605,0.46916,-0.85149,0.10154,0.21426,-0.20587,0.23636,0.21321,-0.21287,0.12107,\n",
    "        0.18766,-0.23282,-0.25499,-0.39631,0.84379,1.6801,-0.40941,-1.9976,-0.69868,0.21732,1.2197,0.55126,\n",
    "        0.44095,0.72588,-0.092053,-0.022406,0.72039,0.1076,0.84116,0.30312,-0.42544,0.056362,0.13109,\n",
    "        -0.071181,-0.10579,0.56677,0.54547,0.84113,0.14861,-0.62628,-0.68391,-1.0831,-0.088385,0.32167,\n",
    "        0.47794,0.091868,-1.2559,-1.2268,0.085401,0.36833,0.081566,-0.76611,0.87751,-0.22008,0.82401,\n",
    "        -0.092207,-0.45941,0.46571,-0.56018,-0.54648,0.15162,-0.30754\n",
    "    ]\n",
    ")\n",
    "\n",
    "vec_king = DenseVector(\n",
    "    [\n",
    "        -0.32307,-0.87616,0.21977,0.25268,0.22976,0.7388,-0.37954,-0.35307,-0.84369,-1.1113,-0.30266,0.33178,\n",
    "        -0.25113,0.30448,-0.077491,-0.89815,0.092496,-1.1407,-0.58324,0.66869,-0.23122,-0.95855,0.28262,\n",
    "        -0.078848,0.75315,0.26584,0.3422,-0.33949,0.95608,0.065641,0.45747,0.39835,0.57965,0.39267,-0.21851\n",
    "        ,0.58795,-0.55999,0.63368,-0.043983,-0.68731,-0.37841,0.38026,0.61641,-0.88269,-0.12346,-0.37928,\n",
    "        -0.38318,0.23868,0.6685,-0.43321,-0.11065,0.081723,1.1569,0.78958,-0.21223,-2.3211,-0.67806,0.44561\n",
    "        ,0.65707,0.1045,0.46217,0.19912,0.25802,0.057194,0.53443,-0.43133,-0.34311,0.59789,-0.58417,\n",
    "        0.068995,0.23944,-0.85181,0.30379,-0.34177,-0.25746,-0.031101,-0.16285,0.45169,-0.91627,0.64521,\n",
    "        0.73281,-0.22752,0.30226,0.044801,-0.83741,0.55006,-0.52506,-1.7357,0.4751,-0.70487,0.056939,\n",
    "        -0.7132,0.089623,0.41394,-1.3363,-0.61915,-0.33089,-0.52881,0.16483,-0.98878\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Hard-coded 100-component vector for the word 'queen'; the norm printout\n",
    "# and the analogy query later in this cell both read it.\n",
    "vec_queen = DenseVector([\n",
    "    -0.50045, -0.70826, 0.55388, 0.673, 0.22486, 0.60281, -0.26194, 0.73872, -0.65383, -0.21606,\n",
    "    -0.33806, 0.24498, -0.51497, 0.8568, -0.37199, -0.58824, 0.30637, -0.30668, -0.2187, 0.78369,\n",
    "    -0.61944, -0.54925, 0.43067, -0.027348, 0.97574, 0.46169, 0.11486, -0.99842, 1.0661, -0.20819,\n",
    "    0.53158, 0.40922, 1.0406, 0.24943, 0.18709, 0.41528, -0.95408, 0.36822, -0.37948, -0.6802,\n",
    "    -0.14578, -0.20113, 0.17113, -0.55705, 0.7191, 0.070014, -0.23637, 0.49534, 1.1576, -0.05078,\n",
    "    0.25731, -0.091052, 1.2663, 1.1047, -0.51584, -2.0033, -0.64821, 0.16417, 0.32935, 0.048484,\n",
    "    0.18997, 0.66116, 0.080882, 0.3364, 0.22758, 0.1462, -0.51005, 0.63777, 0.47299, -0.3282,\n",
    "    0.083899, -0.78547, 0.099148, 0.039176, 0.27893, 0.11747, 0.57862, 0.043639, -0.15965, -0.35304,\n",
    "    -0.048965, -0.32461, 1.4981, 0.58138, -1.132, -0.60673, -0.37505, -1.1813, 0.80117, -0.50014,\n",
    "    -0.16574, -0.70584, 0.43012, 0.51051, -0.8033, -0.66572, -0.63717, -0.36032, 0.13347, -0.56075\n",
    "])\n",
    "\n",
    "# L2 norm of each individual word vector.\n",
    "print(f\"'man' vector normL2 : {vec_man.normL2()}\")\n",
    "print(f\"'woman' vector normL2 : {vec_woman.normL2()}\")\n",
    "print(f\"'king' vector normL2 : {vec_king.normL2()}\")\n",
    "print(f\"'queen' vector normL2 : {vec_queen.normL2()}\")\n",
    "\n",
    "# L2 norm of each gender offset, then of the gap between the two offsets.\n",
    "print(f\"'man - woman' normL2 : {vec_man.minus(vec_woman).normL2()}\")\n",
    "print(f\"'king - queen' normL2 : {vec_king.minus(vec_queen).normL2()}\")\n",
    "print(\n",
    "    \"(man - woman) - (king - queen) normL2 : \"\n",
    "    f\"{vec_man.minus(vec_woman).minus(vec_king.minus(vec_queen)).normL2()}\"\n",
    ")\n",
    "\n",
    "# Query rows: the two original words plus their gender-shifted analogy\n",
    "# vectors. Each vector is passed through str() because the schema below\n",
    "# declares the \"vec\" column as a string.\n",
    "df = pd.DataFrame(\n",
    "    [\n",
    "        [label, str(vector)]\n",
    "        for label, vector in (\n",
    "            (\"king\", vec_king),\n",
    "            (\"king-man+woman\", vec_king.minus(vec_man).plus(vec_woman)),\n",
    "            (\"queen\", vec_queen),\n",
    "            (\"queen-woman+man\", vec_queen.minus(vec_woman).plus(vec_man)),\n",
    "        )\n",
    "    ]\n",
    ")\n",
    "target_set = BatchOperator.fromDataframe(df, schemaStr='word string, vec string')\n",
    "\n",
    "# Fit the neighbour search on the GloVe word table, then look up the five\n",
    "# closest words (Euclidean metric) for every query row.\n",
    "(\n",
    "    VectorNearestNeighbor()\n",
    "    .setIdCol(\"word\")\n",
    "    .setSelectedCol(\"vec\")\n",
    "    .setMetric('EUCLIDEAN')\n",
    "    .setOutputCol(\"similar_words\")\n",
    "    .setTopN(5)\n",
    "    .fit(getGlove6B100d())\n",
    "    .transform(target_set)\n",
    "    .select(\"word, similar_words\")\n",
    "    .print()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_2\n",
    "\n",
    "# Raw text of the novel; the operators below read its \"text\" column.\n",
    "source = TextSourceBatchOp().setFilePath(DATA_DIR + ORIGIN_FILE)\n",
    "\n",
    "# Register a lazy preview of 8 rows.\n",
    "source.lazyPrint(8)\n",
    "\n",
    "# Names passed to the segmenter as its user-defined dictionary.\n",
    "CHARACTER_DICT = [\n",
    "    \"曹操\", \"孔明\", \"玄德\", \"刘玄德\", \"刘备\",\n",
    "    \"关羽\", \"张飞\", \"赵云\", \"曹孟德\", \"诸葛亮\",\n",
    "    \"张郃\", \"孙权\", \"张辽\", \"鲁肃\"\n",
    "]\n",
    "\n",
    "# Segment the text, drop stop words, count words, and print the 20 rows\n",
    "# with the highest \"cnt\".\n",
    "(\n",
    "    source\n",
    "    .link(\n",
    "        SegmentBatchOp()\n",
    "        .setSelectedCol(\"text\")\n",
    "        .setUserDefinedDict(CHARACTER_DICT)\n",
    "    )\n",
    "    .link(StopWordsRemoverBatchOp().setSelectedCol(\"text\"))\n",
    "    .link(WordCountBatchOp().setSelectedCol(\"text\"))\n",
    "    .orderBy(\"cnt\", 20, order='desc')\n",
    "    .print()\n",
    ")\n",
    "\n",
    "# Train and save the Word2Vec model only when no model file exists yet;\n",
    "# otherwise this step is skipped.\n",
    "if not os.path.exists(DATA_DIR + W2V_MODEL_FILE):\n",
    "    (\n",
    "        source\n",
    "        .link(\n",
    "            SegmentBatchOp()\n",
    "            .setSelectedCol(\"text\")\n",
    "            .setUserDefinedDict(CHARACTER_DICT)\n",
    "        )\n",
    "        .link(\n",
    "            StopWordsRemoverBatchOp()\n",
    "            .setSelectedCol(\"text\")\n",
    "            # Explicit stop-word list for this corpus (each word listed once).\n",
    "            .setStopWords([\n",
    "                \"亦\", \"曰\", \"遂\", \"吾\", \"已\", \"去\", \"二人\", \"今\", \"使\", \"中\", \"知\",\n",
    "                \"不\", \"见\", \"都\", \"令\", \"却\", \"欲\", \"请\", \"人\", \"谓\", \"不可\", \"闻\",\n",
    "                \"前\", \"后\", \"皆\", \"便\", \"问\", \"日\", \"时\", \"耳\", \"不敢\", \"回\", \"才\",\n",
    "                \"之事\", \"之人\", \"之时\", \"料\", \"今日\", \"令人\", \"受\", \"说\", \"出\", \"已毕\",\n",
    "                \"不得\", \"使人\", \"众\", \"何不\", \"不知\", \"再\", \"处\", \"无\", \"即日\", \"诸\", \"此时\",\n",
    "                \"只\", \"下\", \"还\", \"上\", \"杀\", \"将军\", \"却说\", \"兵\", \"汝\", \"走\", \"言\", \"寨\",\n",
    "                \"不能\", \"斩\", \"死\", \"商议\", \"听\", \"军士\", \"军\", \"左右\", \"军马\", \"引兵\", \"次日\",\n",
    "                \"二\", \"看\", \"耶\", \"退\", \"更\", \"毕\", \"正\", \"一人\", \"原来\", \"大笑\", \"车胄\", \"口\",\n",
    "                \"引\", \"大喜\", \"其事\", \"助\", \"事\", \"未\", \"大\", \"至此\", \"讫\", \"心中\", \"敢\"\n",
    "            ])\n",
    "        )\n",
    "        .link(\n",
    "            Word2VecTrainBatchOp()\n",
    "            .setSelectedCol(\"text\")\n",
    "            .setMinCount(10)\n",
    "            .setNumIter(50)\n",
    "        )\n",
    "        .link(\n",
    "            AkSinkBatchOp().setFilePath(DATA_DIR + W2V_MODEL_FILE)\n",
    "        )\n",
    "    )\n",
    "    # Run the job assembled above so the sink writes the model file.\n",
    "    BatchOperator.execute()\n",
    "\n",
    "\n",
    "# Load the saved word-vector table from the model file.\n",
    "word2vec = AkSourceBatchOp().setFilePath(DATA_DIR + W2V_MODEL_FILE)\n",
    "\n",
    "# Fit the neighbour search on the whole table, then print the 20 nearest\n",
    "# words for each selected word. The closing semicolon stops Jupyter from\n",
    "# echoing the value of this last expression.\n",
    "(\n",
    "    VectorNearestNeighbor()\n",
    "    .setIdCol(\"word\")\n",
    "    .setSelectedCol(\"vec\")\n",
    "    .setTopN(20)\n",
    "    .setOutputCol(\"similar_words\")\n",
    "    .fit(word2vec)\n",
    "    .transform(\n",
    "        word2vec.filter(\n",
    "            \"word IN ('曹操', '操', '玄德', '刘备', '孔明', '亮', \"\n",
    "            \"'卧龙', '周瑜', '吕布', '貂蝉', '云长', '孙权')\"\n",
    "        )\n",
    "    )\n",
    "    .select(\"word, similar_words\")\n",
    "    .print()\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
